library(tidyverse)
library(scales)
theme_set(theme_classic())
employed <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv')
earn <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/earn.csv')
Let’s use k-means clustering to explore [employment by race and gender].
Explore data
employed %>%
count(race_gender)
## # A tibble: 6 x 2
## race_gender n
## <chr> <int>
## 1 Asian 1254
## 2 Black or African American 1386
## 3 Men 1386
## 4 TOTAL 1386
## 5 White 1386
## 6 Women 1386
employed_tidied <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation),
race_gender) %>%
summarize(n = mean(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the `.groups` argument.
Get the data ready for k-means
employed_tidied %>%
filter(race_gender == "TOTAL")
## # A tibble: 239 x 3
## occupation race_gender n
## <chr> <chr> <dbl>
## 1 Agriculture and related Construction and extraction occu… TOTAL 1.22e4
## 2 Agriculture and related Farming, fishing, and forestry o… TOTAL 9.56e5
## 3 Agriculture and related Installation, maintenance, and r… TOTAL 3.23e4
## 4 Agriculture and related Manage-ment, business, and finan… TOTAL 1.01e6
## 5 Agriculture and related Management, business, and financ… TOTAL 1.04e6
## 6 Agriculture and related Office and administrative suppor… TOTAL 8.58e4
## 7 Agriculture and related Production occupations TOTAL 3.52e4
## 8 Agriculture and related Professional and related occupat… TOTAL 4.92e4
## 9 Agriculture and related Protective service occupations TOTAL 1.47e4
## 10 Agriculture and related Sales and related occupations TOTAL 1.57e4
## # … with 229 more rows
employed_demo <- employed_tidied %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidied %>%
filter(race_gender == "TOTAL") %>%
select(-race_gender) %>%
rename(total = n)) %>%
filter(total > 1e4) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / total),
total = log(total),
across(is.numeric, ~ as.numeric(scale(.)))) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
## Joining, by = "occupation"
## Warning: Predicate functions must be wrapped in `where()`.
##
## # Bad
## data %>% select(is.numeric)
##
## # Good
## data %>% select(where(is.numeric))
##
## ℹ Please update your code.
## This message is displayed once per session.
We are now ready to explore which occupations are most likely to be together in terms of demographic representations among these categories we have and total.
Implement k-means clustering!
employed_clust <- kmeans(select(employed_demo, -occupation), centers = 3)
summary(employed_clust)
## Length Class Mode
## cluster 211 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employed_clust)
## # A tibble: 3 x 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 -0.842 -0.728 -0.993 -0.582 63 95.7 1
## 2 0.722 -0.208 0.719 0.746 89 233. 2
## 3 -0.190 1.09 -0.0237 -0.504 59 116. 3
augment(employed_clust, employed_demo) %>%
ggplot(aes(total, women, color = .cluster)) +
geom_point(alpha = .8)

Choosing K
kclusts <- tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employed_demo, -occupation), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, employed_demo)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = .8) +
geom_point(size = 2)

library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
employed_clust <- kmeans(select(employed_demo, -occupation), centers = 5)
p <- augment(employed_clust, employed_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point(alpha = .8)
ggplotly(p)